# -*- coding: utf-8 -*-
"""Deep_Learning_sensitivity_paper_2023.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1eJAcpyPHVwC3o_yFxyI-amwR9_pgvZgI
"""

!pip install tensorflow==2.14.1
!pip install keras==2.14.0
!pip install eli5
!pip install shap
!pip install scikeras

# Commented out IPython magic to ensure Python compatibility.
# %matplotlib inline

# libraries & dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid")
sns.set_theme(style="white")

import keras
from keras.models import Sequential
from keras.layers import Dense
from scikeras.wrappers import KerasRegressor
from keras import optimizers

# Create a History callback to record training history
from keras.callbacks import History

history_callback = History()

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

from scipy.stats import zscore
from scipy.stats import f_oneway


import eli5
from eli5.sklearn import PermutationImportance

# libraries & dataset
from string import ascii_letters
import shap

# print the JS visualization code to the notebook
shap.initjs()

# Read CSV file into a DataFrame
raw_dataset = pd.read_csv('database.csv', sep=',')
df = raw_dataset.copy()

# Filter data for Descriptor T30 and STI
filtered_df = df[df['Descriptor'].isin(['T30', 'STI'])]

# Map Descriptor values to units
unit_mapping = {'T30': 'seconds [s]', 'STI': 'dimensionless'}
filtered_df['Descriptor'] = filtered_df['Descriptor'].map(unit_mapping)

# Define a scientific color palette
scientific_palette = ['#1f77b4', '#ff7f0e']

# Set plotting style to whitegrid
sns.set_style("whitegrid")

# Create a boxplot with data distribution
plt.figure(figsize=(10, 6))
ax = sns.boxplot(x="Classroom_id",
                 y="Value",
                 hue="Descriptor",
                 data=filtered_df,
                 palette=scientific_palette, width=0.5)

# Add strip plot for data distribution
sns.stripplot(x="Classroom_id",
              y="Value",
              hue="Descriptor",
              data=filtered_df,
              color='#ff7f0e',
              dodge=True,
              alpha=0.5)

# Set x-axis label
ax.set_xlabel("Classroom ID")

# Set y-axis label with unit information
ax.set_ylabel(f"T30 / STI ({unit_mapping['T30']} / {unit_mapping['STI']})")

# Add legend with descriptor names and set STI box to be orange
legend = ax.legend(title='Descriptor',
                   loc='upper right',
                   labels=['T30', 'STI'])

legend.legendHandles[1].set_facecolor('#ff7f0e')  # Set color for STI box

# Remove grid lines
ax.yaxis.grid(False)

# Show the plot
plt.show()

# Generate dataset
raw_dataset = pd.read_csv('/content/data_to_correlation.csv', sep=',')
d = raw_dataset.copy()
df.tail()

# Compute the correlation matrix
corr = d.corr()

# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1.0, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

# Commented out IPython magic to ensure Python compatibility.
from scipy.stats import linregress


def auto_sensitivy_analisys(raw_dataset_file, x_label, y_label):

  # Load entire dataset for training
  raw_dataset = pd.read_csv(raw_dataset_file, sep=',')
  df = raw_dataset.copy()
  df.tail()

  # Set up variables to the tranings variables
  df = df.sort_index()

  df = df[y_label + x_label]
  df = df.dropna()
  df.tail()

  # Split the data into test and training sets.
  np.random.seed(100)
  X_train, X_test, y_train, y_test = train_test_split(df[x_label],
                                                      df[y_label],
                                                      test_size=0.01)
  # Print the dimensions
  print('Training set dimensions X, y: ' + str(X_train.shape) + ' ' +str(y_train.shape))
  print('Test set dimensions X, y: ' + str(X_test.shape) + ' '+ str(y_test.shape))


  # Define regression model in Keras
  def DL_regression_model():
      model = Sequential()
      model.add(Dense(8, input_dim=4, activation='relu'))
      model.add(Dense(4, activation='relu'))
      model.add(Dense(2, activation='relu'))
      model.add(Dense(1, activation='linear'))

      # Compile model
      adam = optimizers.Adam(0.01)
      model.compile(loss='mean_squared_error',
                    optimizer=adam,
                    metrics=['mse'])

      return model

  # O parâmetro patience é o quantidade de epochs para checar as melhoras
  early_stop = keras.callbacks.EarlyStopping(monitor='val_loss',
                                             patience=5)

  estimator = KerasRegressor(build_fn=DL_regression_model,
                            validation_split = 0.1,
                            batch_size=2,
                            loss='mse',
                            epochs=20,
                            verbose=0,
                            callbacks=[early_stop,
                                       history_callback]
                            )

  history = estimator.fit(X_train, y_train)

  print(DL_regression_model)

  isPlot = False

  if isPlot == True:
   # Plot training & validation loss values

   plt.plot(history_callback.history['loss'])
   plt.plot(history_callback.history['val_loss'])
   plt.title('Model loss')
   plt.xlabel('Epoch')
   plt.ylabel('Loss')
   plt.legend(['Train', 'Validation'], loc='upper right')
   plt.show()


  isCross = False

  if isCross == True:
    # Cross validation

    kfold = KFold(n_splits=4)
    result = cross_val_score(estimator,
                            X_train.values,
                            y_train.values,
                            cv=kfold,
                            n_jobs=1)

    print("Results: %.2f (%.2f) MSE" % (result.mean(), result.std()))

    # Assuming 'result' is the array of cross-validation results
    cv_results = pd.DataFrame({'Fold': range(1, len(result) + 1), 'MSE': result})

    # Display the table
    print(cv_results)

  # Fitting resuts

  fitted = estimator.predict(X_train)
  residuals = y_train[y_label[0]].values - fitted

  # Two plots
  fig, (ax1, ax2) = plt.subplots(ncols=2,figsize=(12,6))

  # 1. Histogram of residuals
  sns.distplot(residuals, ax=ax1)
  ax1.set_title('Histogram of residuals')

  # Fitted vs residuals
  x1 = pd.Series(fitted.reshape(15), name='Fitted total')
  x2 = pd.Series(y_train[y_label[0]], name="total values")

  #sns.kdeplot(x1, n_levels=5,ax = ax2)
  sns.regplot(x=x1, y=x2, scatter=False, ax = ax2)
  ax2.set_title('Fitted vs actual values')
  ax2.set_xlim([0,1])
  ax2.set_ylim([0,1])
  ax2.set_aspect('equal')

  # Calculate linear DL model regression parameters
  DL_slope, intercept, r_value, p_value, std_err = linregress(y_train[y_label[0]].values,
                                                              fitted.reshape(15))

  # Print the slope coefficient
  print(f"Slope Coefficient: {DL_slope}")

  plt.show()

  # Permutation model
  perm = PermutationImportance(estimator,
                               random_state=8).fit(X_train,y_train)

  perm_importances = eli5.explain_weights_df(perm, feature_names=X_train.columns.tolist())

  # SHAP expects model functions to take a 2D numpy array as input,
  # so we define a wrapper function around the original Keras predict function.
  def f_wrapper(X):
      return estimator.predict(X).flatten()

  X_train_summary = shap.kmeans(X_train, 10)
  X_train_summary = X_train_summary.data.astype(int)

  # Compute Shap values
  explainer = shap.KernelExplainer(f_wrapper, X_train_summary)

  # Create subplots
  plt.figure(1)
  # Make plot with combined shap values
  X_train_sample = X_train.sample(10)
  shap_values  = explainer.shap_values(X_train_sample)
  shap.summary_plot(shap_values, X_train_sample)
  plt.show()

  print(shap_values)

  column_means = shap_values.max(axis=0)

  print("Shape values", column_means.sort)
  print("Shape values", np.sort(column_means))

  if isPlot == True:
   # Dependence plots
   shap.dependence_plot(0, shap_values, X_train_sample)
   shap.dependence_plot(1, shap_values, X_train_sample)
   shap.dependence_plot(2, shap_values, X_train_sample)
   shap.dependence_plot(3, shap_values, X_train_sample)

  shape_result = np.mean(shap_values, axis=0)
  print("Shap values final", shape_result)

  # Create linear regression object
  regr = linear_model.LinearRegression()

  # Train the model using the training sets
  regr.fit(X_train, y_train)

  # Make predictions using the testing set
  y_pred = regr.predict(X_train)

  # The coefficients
  print('Coefficients: \n', regr.coef_)
  # The mean squared error
  print('Mean squared error: %.2f'
#         % mean_squared_error(y_train, y_pred))
  # The coefficient of determination: 1 is perfect prediction
  print('Coefficient of determination: %.2f'
#         % r2_score(y_train, y_pred))

  return y_label[0], perm_importances['weight'].values, shape_result, regr.coef_[0], DL_slope

raw_dataset.columns.values

x_label = ['A',	'B',	'C',	'D']
raw_dataset_file = '/content/training_dataset.csv'
result = []

for class_id in raw_dataset.columns.values:
    print(f"Evaluating classroom: {class_id}")

    while True:
        partial_resul = auto_sensitivy_analisys(raw_dataset_file, x_label, [class_id])
        if partial_resul[2].mean() != 0 or partial_resul[4] > 0.9:
            result.append(partial_resul[:-1])
            break  # Exit the loop if the condition is met

# result

df= pd.DataFrame(result)

# Flatten the nested arrays
labels = ['A', 'B', 'C', 'D']
df_expanded = pd.concat([df[0], df[1].apply(pd.Series).rename(lambda x: f'Shape {labels[x]}', axis=1),
                        df[2].apply(pd.Series).rename(lambda x: f'Perm {labels[x]}', axis=1),
                        df[3].apply(pd.Series).rename(lambda x: f'RL {labels[x]}', axis=1)], axis=1)

# Display the expanded DataFrame
df_expanded= df_expanded.rename(columns={0: 'CLASSROOM'})
df_expanded

# Filter T30 and STI classrooms
df = df_expanded
df_t30 = df[df['CLASSROOM'].str.contains('T30')]
df_sti = df[df['CLASSROOM'].str.contains('STI')]

# Select T30 and STI parameter columns for each method
shape_columns = ['Shape A', 'Shape B', 'Shape C', 'Shape D']
perm_columns = ['Perm A', 'Perm B', 'Perm C', 'Perm D']
rl_columns = ['RL A', 'RL B', 'RL C', 'RL D']

# Extract T30 and STI parameter values for each method
shape_t30 = df_t30.loc[:, ['CLASSROOM'] + shape_columns]
perm_t30 = df_t30.loc[:, ['CLASSROOM'] + perm_columns]
rl_t30 = df_t30.loc[:, ['CLASSROOM'] + rl_columns]

shape_sti = df_sti.loc[:, ['CLASSROOM'] + shape_columns]
perm_sti = df_sti.loc[:, ['CLASSROOM'] + perm_columns]
rl_sti = df_sti.loc[:, ['CLASSROOM'] + rl_columns]

# Set up the figure and axis with subplots
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(15, 10))

# Plot bar plots for Shape, Perm, RL
shape_t30.set_index('CLASSROOM').plot(kind='bar', ax=axes[0, 0], rot=0, title='Shape - T30')
perm_t30.set_index('CLASSROOM').plot(kind='bar', ax=axes[1, 0], rot=0, title='Perm - T30')
rl_t30.set_index('CLASSROOM').plot(kind='bar', ax=axes[2, 0], rot=0, title='RL - T30')

shape_sti.set_index('CLASSROOM').plot(kind='bar', ax=axes[0, 1], rot=0, title='Shape - STI')
perm_sti.set_index('CLASSROOM').plot(kind='bar', ax=axes[1, 1], rot=0, title='Perm - STI')
rl_sti.set_index('CLASSROOM').plot(kind='bar', ax=axes[2, 1], rot=0, title='RL - STI')

# Adjust layout
plt.tight_layout()

# Show the plot
plt.show()

# Extract columns for each group
shape_columns = ['Shape A', 'Shape B', 'Shape C', 'Shape D']
perm_columns = ['Perm A', 'Perm B', 'Perm C', 'Perm D']
rl_columns = ['RL A', 'RL B', 'RL C', 'RL D']

# Create new DataFrames for each group
df_shape = df_expanded[shape_columns].apply(zscore, axis=1)
df_perm = df_expanded[perm_columns].apply(zscore, axis=1)
df_rl = df_expanded[rl_columns].apply(zscore, axis=1)

# Rename the columns with the respective group and z-score
df_shape.columns = [f'Shape_{i}_zscore' for i in range(len(df_shape.columns))]
df_perm.columns = [f'Perm_{i}_zscore' for i in range(len(df_perm.columns))]
df_rl.columns = [f'RL_{i}_zscore' for i in range(len(df_rl.columns))]

# Concatenate the DataFrames
df_zscores = pd.concat([df_shape, df_perm, df_rl], axis=1)

# Display the new DataFrame with z-scores
df_zscores

def perform_anova_and_decision(dataframe, group_columns, significance_level=0.05):
    """
    Perform one-way ANOVA on the specified groups in the dataframe.

    Parameters:
    - dataframe: pandas DataFrame
        The DataFrame containing the z-scores for different groups.
    - group_columns: list
        List of column names representing the groups.
    - significance_level: float, optional (default=0.05)
        The significance level for the hypothesis test.

    Returns:
    - result: str
        A string indicating whether to accept or reject the null hypothesis.
    """
    # Perform ANOVA for the specified groups
    anova_result = f_oneway(*[dataframe[col] for col in group_columns])

    # Check the p-value against the significance level
    p_value = anova_result.pvalue

    if p_value < significance_level:
        return f"Reject the null hypothesis. (p-value: {p_value:.4f})"
    else:
        return f"Accept the null hypothesis. (p-value: {p_value:.4f})"

# Example usage:
# Assuming df_zscores has been created in the previous code
shape_columns = [f'Shape_{i}_zscore' for i in range(4)]
perm_columns = [f'Perm_{i}_zscore' for i in range(4)]
rl_columns = [f'RL_{i}_zscore' for i in range(4)]

print("ANOVA Decision for Shape:", perform_anova_and_decision(df_zscores, shape_columns))
print("ANOVA Decision for Perm:", perform_anova_and_decision(df_zscores, perm_columns))
print("ANOVA Decision for RL:", perform_anova_and_decision(df_zscores, rl_columns))

# In ANOVA, the null hypothesis is that the means of the groups are equal

# Calculate the correlation matrix
correlation_matrix = df_expanded.corr()

# Create a mask for the upper triangle
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

# Set up the matplotlib figure
plt.figure(figsize=(10, 8))

# Create a heatmap using seaborn with the lower triangular mask
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5, mask=mask)

# Show the plot
plt.show()

!pip freeze

import tensorflow as tf
import datetime, os

# Load entire dataset for training

x_label = ['A',	'B',	'C',	'D']
raw_dataset_file = '/content/training_dataset.csv'
y_label = ['Class 1 - T30']


raw_dataset = pd.read_csv(raw_dataset_file, sep=',')
df = raw_dataset.copy()
df.tail()

# Set up variables to the tranings variables
df = df.sort_index()

df = df[y_label + x_label]
df = df.dropna()
df.tail()

# Split the data into test and training sets.
np.random.seed(100)
X_train, X_test, y_train, y_test = train_test_split(df[x_label],
                                                    df[y_label],
                                                    test_size=0.01)
# Print the dimensions
print('Training set dimensions X, y: ' + str(X_train.shape) + ' ' +str(y_train.shape))
print('Test set dimensions X, y: ' + str(X_test.shape) + ' '+ str(y_test.shape))

from keras import backend as K

# Do some code, e.g. train and save model

K.clear_session()

# Define regression model in Keras
def DL_regression_model():
    model = Sequential()
    model.add(Dense(8, input_dim=4, activation='relu'))
    model.add(Dense(4, activation='relu'))
    model.add(Dense(2, activation='relu'))
    model.add(Dense(1, activation='linear'))

    return model


def train_model():

  tf.keras.backend.clear_session()
  history_callback = History()

  model = DL_regression_model()

  # Compile model
  adam = optimizers.Adam(0.01)
  model.compile(loss='mean_squared_error',
                optimizer=adam,
                metrics=['mse'])

  logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

  # O parâmetro patience é o quantidade de epochs para checar as melhoras
  early_stop = keras.callbacks.EarlyStopping(monitor='val_loss',
                                             patience=5)

  tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir,
                                                        histogram_freq=1)

  model.fit(x=X_train,
            y=y_train,
            validation_split = 0.1,
            batch_size=2,
            epochs=5,
            validation_data=(X_test, y_test),
            callbacks=[early_stop,
                       history_callback,
                       tensorboard_callback])

  # Plot training & validation loss values
  plt.plot(history_callback.history['loss'])
  plt.plot(history_callback.history['val_loss'])
  plt.title('Model loss')
  plt.xlabel('Epoch')
  plt.ylabel('Loss')
  plt.legend(['Train', 'Validation'], loc='upper right')
  plt.show()


train_model()

DL_regression_model().summary()

from tensorflow.keras.utils import plot_model
plot_model(DL_regression_model(), to_file='model.png')

keras.utils.model_to_dot(
    DL_regression_model(),
    show_shapes=False,
    show_dtype=False,
    show_layer_names=True,
    rankdir="TB",
    expand_nested=False,
    dpi=200,
    subgraph=False,
    show_layer_activations=False,
    show_trainable=False,
)

from tensorboard import notebook
notebook.list() # View open TensorBoard instances

# Control TensorBoard display. If no port is provided,
# the most recently launched TensorBoard is used
notebook.display(port=6006, height=1000)

# Commented out IPython magic to ensure Python compatibility.
# Load the TensorBoard notebook extension
# %load_ext tensorboard

# Commented out IPython magic to ensure Python compatibility.
# %tensorboard --logdir logs